This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(tidyplots)
Attaching package: ‘tidyplots’
The following object is masked from ‘package:ggpubr’:
gene_expression
#ATCC genomes updated version
#October 2024
#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data
####read counts/normalised read counts####
#metadata
metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <- counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
depth_nodedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
Rows: 95 Columns: 12── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (7): Viral load, mean_depth, median_depth, min, max, LQ, UQ
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_depths <-
depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log(mean_depth),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10() +
ylab("Log(mean read depth)")
#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)
ggarrange(
read_count,
read_count_norm1,
read_count_norm2,
read_depths,
nrow = 2,
ncol = 2,
common.legend = TRUE,
align = "hv"
)
Error: object 'read_count' not found
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <- counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
####compare library capture pool####
cols3 <- c("#228833", "#AA3377")
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
metadata3 <-
metadata |>
select(Sample.ID, Pool.for.sequencing) |>
rename(Sample_id = Sample.ID) |>
rename(Pool = Pool.for.sequencing)
counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")
pool_readcounts <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(x = virus, y = log(matched))) +
geom_boxplot() +
facet_grid(Pool ~ type) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (Viral Reads)") + xlab("Viral load (copies / ml)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))
pool_counts_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (Viral Reads)") + xlab("Viral load (copies / ml)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))
pool_readcounts_norm <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(x = virus, y = log(norm_counts1))) +
geom_boxplot() +
facet_grid(Pool ~ type) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log(viral reads/cleaned reads)") + xlab("Viral load (copies / ml)")
pool_norm1_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log(viral reads/cleaned reads)") + xlab("Viral load (copies / ml)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))
pool_readcounts_norm2 <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(x = virus, y = log(norm_counts2))) +
geom_boxplot() +
facet_grid(Pool ~ type) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log(viral reads/cleaned reads/genome length)") + xlab("Viral load (copies / ml)")
pool_norm2_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log(viral reads/cleaned reads/genome length)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
xlab("Viral load (copies / ml)")
#import and combine read depth files
depth_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depth_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <-
left_join(depths_bt_all, metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")
pool_readdepths <-
depths_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(x = virus, y = log(mean_depth))) +
geom_boxplot() +
facet_grid(Pool ~ type) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (mean read depth)") + xlab("Viral load (copies / ml)")
pool_depths_sum <-
depths_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log(mean_depth),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (mean read depth)") +
xlab("Viral load (copies / ml)") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))
ggarrange(
pool_counts_sum,
pool_norm1_sum,
pool_norm2_sum,
pool_depths_sum,
nrow = 2,
ncol = 2,
common.legend = TRUE,
align = "hv"
)
#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)
#ggsave("figures/manuscript_figures_pdf/FigureS2.pdf",width=10,height=7)
##plots of read counts and viral read counts
#November 2024
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import read count files
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
#import viral reads mapped (to calculate proportions)
viral_reads_dedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
header = TRUE
)
viral_reads_nodedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
header = TRUE
)
cols2 <- c("#BB5566", "#004488")
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
reads_metadata_dedup <-
left_join(counts_dedup_bt, metadata2, by = "Sample_id")
reads_viral_dedup <-
left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")
reads_metadata_nodedup <-
left_join(counts_nodedup_bt, metadata2, by = "Sample_id")
reads_viral_nodedup <-
left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")
reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)
reads_plot <-
reads_viral_all |>
group_by(Background, Sample_id, Viral.load, type) |>
summarise(
total_reads = (QC_reads * 2),
viral_reads = total_virus_reads,
ATCC_reads = sum(matched),
prop_ATCC = ATCC_reads / total_reads,
prop_viral = total_virus_reads / total_reads,
diff = viral_reads - ATCC_reads
) |>
unique()
####read counts/depths split by background####
#change labels in facet plots
facet_names_bg <- c(
"dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated",
"p2" = "P1",
"p8" = "P2"
)
#break down by Background x virus
background_counts_reads <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log(matched),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels =
c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(labels = scales::label_log()) +
ylab("Log(Viral Reads)")
backgrounds_counts_reads_norm <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log(norm_counts1),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_bw() +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(label = scales::label_log()) +
ylab("Log(viral reads/cleaned reads)")
backgrounds_counts_reads_norm2 <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log(norm_counts2),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(labels = scales::label_log()) +
ylab("Log(viral reads/cleaned reads/genome length)")
background_read_depths <-
depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log(mean_depth),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
axis.title.y = element_text(size = 10),
legend.title = element_blank()
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(label = scales::label_log()) +
ylab("Log(mean read depth)")
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
,
.default = "RNA"
))
####linear model for spike in viruses - mean read depth####
depths_reads_sub <- depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
mutate(log_depth = log(mean_depth)) |>
mutate(log_viral_load = log10(Viral.load))
#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)
summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)
lm_tidy <-
do.call(rbind.data.frame, tidy(list_models))
Error: No tidy method recognized for this list.
#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024
#use bowtie2 data
####genome coverage####
unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")
unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")
dedup_per_site <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
)
nodedup_per_site <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")
file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")
#add length column from read depth file to per site file
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <-
metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
)
counts_nodedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <-
counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
lengths <-
counts_reads_norm |>
select(virus, length) |>
distinct()
#calculate genome coverage
persite_coverage_dedup <-
dedup_per_site |>
rename(Viral.load = `Viral load`) |>
group_by(Sample_id, virus, Viral.load, Background, type) |>
filter(coverage > 0) |>
summarise(genome_coverage = n()) |>
left_join(lengths) |>
mutate(percent_coverage = genome_coverage / length)
persite_coverage_nodedup <-
nodedup_per_site |>
rename(Viral.load = `Viral load`) |>
group_by(Sample_id, virus, Viral.load, Background, type) |>
filter(coverage > 0) |>
summarise(genome_coverage = n()) |>
left_join(lengths) |>
mutate(percent_coverage = genome_coverage / length)
persite_coverage_both <-
rbind(persite_coverage_dedup,
persite_coverage_nodedup)
coverage_labels <- c("0" = "Control",
"100" = "10^{2}",
"1000" = "10^{3}",
"10000" = "10^{5}",
"dedup_TE" = "Deduplicated",
"nodedup_TE"= "Non-Deduplicated")
coverage_labels2 <- c("0" = "Control",
"100" = "10^{2}",
"1000" = "10^{3}",
"10000" = "10^{5}")
persite_both <- rbind(dedup_per_site, nodedup_per_site)
persite_reads <- left_join(persite_both, metadata2, by = "Sample_id")
persite_norm <- persite_reads |>
mutate(norm_cov = coverage / QC_reads)
coverage_labels3 <-c("100" = "1e+02",
"1000" = "1e+03",
"100000" = "1e+05",
"p2" = "P1",
"p8"= "P2")
persite_norm$`Viral load` <-
factor(persite_norm$`Viral load`,
labels = c("0",
"10^{2}",
"10^{3}",
"10^{5}"))
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Human_respiratory_syncytial_virus") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_col() +
facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
ylab("Log(Coverage)") +
ggtitle("Human RSV (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("P1", "P2"))
#ggsave("figures/manuscript_figures_pdf/FigureS5.pdf",width=10,height=8)
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Zika_virus") |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_col() +
facet_wrap(Viral.load ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Zika virus (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("P1", "P2"))
#ggsave("figures/manuscript_figures_pdf/FigureS6.pdf",width=10,height=8)
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Human_adenovirus_40") |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_col() +
facet_wrap(Viral.load ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Human Adenovirus (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("P1", "P2"))
#ggsave("figures/manuscript_figures_pdf/FigureS7.pdf",width=20,height=18)
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Human_betaherpesvirus") |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_polygon() +
facet_wrap(Viral.load ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Human Betaherpesvirus (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("P1", "P2"))
#ggsave("figures/compare_spike_ins_atcc/HBHV_coverage_test.pdf",width=20,height=18)
#have to do these differently due to segmentation
FLU_ME1 <- persite_norm |>
filter(virus == "Influenza_B_virus") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Influenza B virus (Background 1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))
FLU_ME2 <- persite_norm |>
filter(virus == "Influenza_B_virus") |>
filter(type == "dedup_TE") |>
filter(Background == "p8") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Influenza B virus (Background 2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))
ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)
#ggsave("figures/manuscript_figures_pdf/FigureS9.pdf",width=20,height=18)
REO_ME1 <- persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))
REO_ME2 <- persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p8") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))
ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)
#ggsave("figures/manuscript_figures_pdf/FigureS10.pdf",width=20,height=18)
# read in metadata
metadata <-
read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)
metadata2 <-
metadata |>
select(
Sample.ID,
Background.sample,
Viral.load,
Number.of.read.pairs..quality.adaptor.trimmed.
) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import viral reads mapped (to calculate proportions)
viral_reads_dedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
header = TRUE
)
viral_reads_nodedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
header = TRUE
)
#import contingency tables
contingency_dedup <-
read.csv(
"data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
header = TRUE
)
contingency_nodedup <-
read.csv(
"data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
header = TRUE
)
contaminants <-
c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")
# pivot longer
dedup_long <-
contingency_dedup |>
filter(!genus %in% contaminants) |>
filter(genus != "NA") |>
pivot_longer(cols = A:P,
names_to = "Sample_id",
values_to = "count") |>
mutate(
target = case_when(
genus == "Cyclovirus" ~ "off_target",
genus == "Cardiovirus" ~ "off_target",
genus == "Kobuvirus" ~ "off_target",
.default = "on_target"
)
) |>
mutate(genome_structure = case_when((genus == "Mastadenovirus" |
genus == "Cytomegalovirus") ~ "DNA",
.default = "RNA"
))
no_dedup_long <- contingency_nodedup |>
filter(!genus %in% contaminants) |>
filter(genus != "NA") |>
pivot_longer(cols = A:P,
names_to = "Sample_id",
values_to = "count") |>
mutate(
target = case_when(
genus == "Cyclovirus" ~ "off_target",
genus == "Cardiovirus" ~ "off_target",
genus == "Kobuvirus" ~ "off_target",
.default = "on_target"
)
) |>
mutate(genome_structure = case_when((genus == "Mastadenovirus" |
genus == "Cytomegalovirus") ~ "DNA",
.default = "RNA"
))
metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")
dedup_viral_reads <-
left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
mutate(total_reads = QC_reads * 2) |>
mutate(prop_total = count / total_reads) |>
mutate(prop_viral = count / total_virus_reads)
metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")
nodedup_viral_reads <-
left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
mutate(total_reads = QC_reads * 2) |>
mutate(prop_total = count / total_reads) |>
mutate(prop_viral = count / total_virus_reads)